1 /* 2 * Copyright (C) 2009 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.xml; 18 19 import com.google.common.annotations.Beta; 20 import com.google.common.annotations.GwtCompatible; 21 import com.google.common.escape.Escaper; 22 import com.google.common.escape.Escapers; 23 24 /** 25 * {@code Escaper} instances suitable for strings to be included in XML 26 * attribute values and elements' text contents. When possible, avoid manual 27 * escaping by using templating systems and high-level APIs that provide 28 * autoescaping. For example, consider <a href="http://www.xom.nu/">XOM</a> or 29 * <a href="http://www.jdom.org/">JDOM</a>. 30 * 31 * <p><b>Note:</b> Currently the escapers provided by this class do not escape 32 * any characters outside the ASCII character range. Unlike HTML escaping the 33 * XML escapers will not escape non-ASCII characters to their numeric entity 34 * replacements. These XML escapers provide the minimal level of escaping to 35 * ensure that the output can be safely included in a Unicode XML document. 36 * 37 * 38 * <p>For details on the behavior of the escapers in this class, see sections 39 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and 40 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the 41 * XML specification. 42 * 43 * @author Alex Matevossian 44 * @author David Beaumont 45 * @since 15.0 46 */ 47 @Beta 48 @GwtCompatible 49 public class XmlEscapers { 50 private XmlEscapers() {} 51 52 private static final char MIN_ASCII_CONTROL_CHAR = 0x00; 53 private static final char MAX_ASCII_CONTROL_CHAR = 0x1F; 54 55 // For each xxxEscaper() method, please add links to external reference pages 56 // that are considered authoritative for the behavior of that escaper. 57 58 /** 59 * Returns an {@link Escaper} instance that escapes special characters in a 60 * string so it can safely be included in an XML document as element content. 61 * See section 62 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the 63 * XML specification. 64 * 65 * <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not 66 * safe</b> to use this escaper to escape attribute values. Use 67 * {@link #xmlContentEscaper} if the output can appear in element content or 68 * {@link #xmlAttributeEscaper} in attribute values. 69 * 70 * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control 71 * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which 72 * are not permitted in XML. For more detail see section <a 73 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the 74 * XML specification. 75 * 76 * <p>This escaper does not escape non-ASCII characters to their numeric 77 * character references (NCR). Any non-ASCII characters appearing in the input 78 * will be preserved in the output. Specifically "\r" (carriage return) is 79 * preserved in the output, which may result in it being silently converted to 80 * "\n" when the XML is parsed. 81 * 82 * <p>This escaper does not treat surrogate pairs specially and does not 83 * perform Unicode validation on its input. 84 */ 85 public static Escaper xmlContentEscaper() { 86 return XML_CONTENT_ESCAPER; 87 } 88 89 /** 90 * Returns an {@link Escaper} instance that escapes special characters in a 91 * string so it can safely be included in XML document as an attribute value. 92 * See section 93 * <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> 94 * of the XML specification. 95 * 96 * <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control 97 * characters and the character values {@code 0xFFFE} and {@code 0xFFFF} which 98 * are not permitted in XML. For more detail see section <a 99 * href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of the 100 * XML specification. 101 * 102 * <p>This escaper does not escape non-ASCII characters to their numeric 103 * character references (NCR). However, horizontal tab {@code '\t'}, line feed 104 * {@code '\n'} and carriage return {@code '\r'} are escaped to a 105 * corresponding NCR {@code "	"}, {@code "
"}, and {@code "
"} 106 * respectively. Any other non-ASCII characters appearing in the input will 107 * be preserved in the output. 108 * 109 * <p>This escaper does not treat surrogate pairs specially and does not 110 * perform Unicode validation on its input. 111 */ 112 public static Escaper xmlAttributeEscaper() { 113 return XML_ATTRIBUTE_ESCAPER; 114 } 115 116 private static final Escaper XML_ESCAPER; 117 private static final Escaper XML_CONTENT_ESCAPER; 118 private static final Escaper XML_ATTRIBUTE_ESCAPER; 119 static { 120 Escapers.Builder builder = Escapers.builder(); 121 // The char values \uFFFE and \uFFFF are explicitly not allowed in XML 122 // (Unicode code points above \uFFFF are represented via surrogate pairs 123 // which means they are treated as pairs of safe characters). 124 builder.setSafeRange(Character.MIN_VALUE, '\uFFFD'); 125 // Unsafe characters are replaced with the Unicode replacement character. 126 builder.setUnsafeReplacement("\uFFFD"); 127 128 /* 129 * Except for \n, \t, and \r, all ASCII control characters are replaced with 130 * the Unicode replacement character. 131 * 132 * Implementation note: An alternative to the following would be to make a 133 * map that simply replaces the allowed ASCII whitespace characters with 134 * themselves and to set the minimum safe character to 0x20. However this 135 * would slow down the escaping of simple strings that contain \t, \n, or 136 * \r. 137 */ 138 for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) { 139 if (c != '\t' && c != '\n' && c != '\r') { 140 builder.addEscape(c, "\uFFFD"); 141 } 142 } 143 144 // Build the content escaper first and then add quote escaping for the 145 // general escaper. 146 builder.addEscape('&', "&"); 147 builder.addEscape('<', "<"); 148 builder.addEscape('>', ">"); 149 XML_CONTENT_ESCAPER = builder.build(); 150 builder.addEscape('\'', "'"); 151 builder.addEscape('"', """); 152 XML_ESCAPER = builder.build(); 153 builder.addEscape('\t', "	"); 154 builder.addEscape('\n', "
"); 155 builder.addEscape('\r', "
"); 156 XML_ATTRIBUTE_ESCAPER = builder.build(); 157 } 158 }